## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (0.12.10)
## than is installed on your system (0.12.13). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
## Warning: replacing previous import 'BiocGenerics::var' by 'stats::var' when
## loading 'MLInterfaces'
# # Read data in
# f <- "../rawdata/P164_TMT_10_plex_efficiency_PeptideGroups.txt"
# e <- grepEcols(f, "Abundance: F", split = "\t")
# xf <- readMSnSet2(f, e, sep = "\t")
# save(xf, file = "../rscript/P164_TMT_10_plex_efficiency_PeptideGroups.rda")
load("../rscript/P164_TMT_10_plex_efficiency_PeptideGroups.rda")
# How many high quality peptides are modified
hqp <- sum(fData(xf)$Confidence == "High")
hqpm <- sum(fData(xf)$Confidence == "High" & grepl("TMT", fData(xf)$Modifications))
perc <- round(hqpm / hqp * 100, 0)Incorporation of the tags was evaluated by the number of peptides with the respective modification in the N-terminal and/or lysine residues1.
The labelling efficiency is: 98%.
# # Read data in
# f <- "../rawdata/P164_TMT_10_plex_result_Proteins.txt"
# e <- grepEcols(f, "Abundance F", split = "\t")
# x <- readMSnSet2(f, e, sep = "\t")
# save(x, file = "../rscript/P164_TMT_10_plex_result_Proteins.rda")
# Read saved data
load("../rscript/P164_TMT_10_plex_result_Proteins.rda")
## Data annotation
# create pData
Sample <- gsub("Abundance\\.F1\\.\\d*[C|N]\\.Sample\\.(\\w*\\.\\d)\\.", "\\1", colnames(exprs(x)))
Sample <- gsub("Abundance\\.F1\\.\\d*\\.Sample\\.(\\w*\\.\\d)\\.", "\\1", Sample)
tmp <- data.frame(do.call(rbind, strsplit(Sample, "[.]")))
names(tmp) <- c("Sample", "Replicate")
pData(x) <- tmp
# Rename rows and colomns
sampleNames(x) <- paste(x$Sample, x$Replicate, sep = "_")
featureNames(x) <- fData(x)$Accession
## Number of high confidance proteins
cp <- data.frame(table(fData(x)$Protein.FDR.Confidence))
colnames(cp)[1] <- "Protein.FDR.Confidence"
tmp <- x[fData(x)$Protein.FDR.Confidence != "Low",]The rawdata file contains 6161 proteins. The protein FDR confidance for these proteins can be:
High [<0.01]
Medium [<0.05]
Low [>0.05]
From now on the analysis will be carry on with high and medium confidance proteins.
The new dataset contains 6073 proteins.
## Plot missing values
naplot(tmp)x <- filterNA(tmp, 0)All the missing values have been removed. The analysis will only includes the proteins with a value for each channel.
The dataset used for the analysis contains 5795 proteins.
The protein quantification values have been log2 transformed and a PC analysis has been performed to visualise the data correlation.
x <- log(x,2)
plot2D(t(x), fcol = "Sample", cex = 2, addLegend = "bottomleft")The follow plot shows the distribution and correlation for every sample, with all the other samples.
library(psych)##
## Attaching package: 'psych'
## The following object is masked from 'package:IRanges':
##
## reflect
pairs.panels(exprs(x))The two plots show the data correlation. Control samples are clustering together very well, while the PERK samples seem to be more disperse.
boxplot(exprs(x), col = rep(c("yellow", "blue"), each = 5))The boxplot shows that the CONTR distributions have very similar median, while the PERK samples show more variability.
Samples have been normalised using each channel median.
x <- normalise(x, "diff.median")boxplot(exprs(x), col = rep(c("yellow", "blue"), each = 5))### There is a negative point!!!!plot2D(t(x), fcol = "Sample", cex = 2, addLegend = "bottomleft")After normalisation the PCA plot looks better, with a clear separation on the PC1 of the two samples.
Here you can see the normalised data. This dataset is the one use for the differential expression analysis.
To perform the DEA we used limma package2.
library(limma)
library(plotly)
## Differential analysis
## Model matrices
mat <- model.matrix(~ 0 + pData(x)$Sample)
colnames(mat) <- c("CONTR", "PERK")
pData(x)$cmat <- mat
## Functions
dostats <- function(x) {
cont.matrix <- makeContrasts(diffexp = PERK - CONTR,
levels = pData(x)[, "cmat"])
## Statistical analysis
fit <- lmFit(exprs(x), pData(x)[, "cmat"])
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
## Result tables
tt1 <- topTable(fit2, adjust.method = "BH", coef = "diffexp", number = Inf)
}
## Statistical analysis
res <- dostats(x)
## Prepare data to be saved
fData(x)$diffexp <- res[featureNames(x), ]
dfr <- cbind(data.frame(exprs(x)),
descr = fData(x)$Description,
logFC = fData(x)$diffexp[, "logFC"],
adj.P.Val = fData(x)$diffexp[, "adj.P.Val"])
# write.csv(dfr, "../analysis/proteins_medium.csv")
plot_ly(dfr, x = ~logFC, y = ~-log10(adj.P.Val), type = "scatter",
text = ~paste("ID: ", rownames(dfr),
"\nDesc: ", descr,
"\nFC: ", round(logFC, 2),
"\n-log10 adj.Pval: ", round(-log10(adj.P.Val), 3),
"\nadjusted pvalue: ", adj.P.Val))In order to directly compare identified proteins and transcripts, the proteins correspondent transcripts have been identified on UniProt website.
The match was many times 1 protein to many transcripts.
The transcripts and proteins identifiers have been joined based on trascript IDs.
The following table reports the correspondences:
# Load proteomics data with transcripts (file generated by rscript/uniprot.R)
load("../rscript/prot_medium_to_trans.rda")
res[, 3:4] <- round(res[, 3:4], 2)
# Load transcriptomics data
tran <- read.csv("../rawdata/Perk flies (Affy proj 2489M).csv")
tran$regt <- ifelse(tran$FC > 2 & tran$FDR < 0.01, 1,
ifelse(tran$FC < -2 & tran$FDR < 0.01, -1, 0))
# Combine the 2 datasets
library(dplyr)
y <- inner_join(tran, res, by = "Transcript.ID")
yf <- full_join(tran, res, by = "Transcript.ID")
# Reorder the columns
y <- y %>% dplyr::select(1, 2, 6, 3:5, 7:9)
yf <- yf %>% dplyr::select(1, 2, 6, 3:5, 7:9)
# Rename column
names(y)[3:9] <- c("Protein.ID", "trans_FC", "trans_FDR", "trans_reg", "prot_FC",
"prot_FDR", "prot_reg")
names(yf)[3:9] <- c("Protein.ID", "trans_FC", "trans_FDR", "trans_reg", "prot_FC",
"prot_FDR", "prot_reg")The follow table contains only the proteins and transcript IDs common between the proteomics and transcriptomics datasets.
# Table
datatable(y,
rownames = TRUE,
filter = "top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('csv', 'copy'))
)While here it is possible to check the proteins/transcripts regulations:
Up regulation: 1
No regulation: 0
Down regulation: -1
# Compare the proteins/transcipts regulation
tt <- table(y$prot_reg, y$trans_reg)
names(dimnames(tt)) <- c("Proteomics", "Transcriptomics")
# Table
datatable(tt,
rownames = FALSE,
# filter = "top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('csv', 'copy'))
)# Compare differentially regulated proteins between LEAVES and ROOTS
y$ind <- 0
for (i in 1:dim(y)[1]){
if (y$prot_reg[i] > 0) {
y$ind[i] <- 1
}
if (y$prot_reg[i] < 0) {
y$ind[i] <- 2
}
if (y$trans_reg[i] > 0) {
y$ind[i] <- 3
}
if (y$trans_reg[i] < 0) {
y$ind[i] <- 4
}
if (y$prot_reg[i] > 0 & y$trans_reg[i] > 0) {
y$ind[i] <- 5
}
if (y$prot_reg[i] < 0 & y$trans_reg[i] < 0) {
y$ind[i] <- 6
}
if (y$prot_reg[i] > 0 & y$trans_reg[i] < 0) {
y$ind[i] <- 7
}
if (y$prot_reg[i] < 0 & y$trans_reg[i] > 0) {
y$ind[i] <- 8
}
}
y <- y %>% arrange((ind))
library(ggthemes)
p1 <- ggplot(y,aes(prot_FC,trans_FC,colour = factor(ind)))
p1 + geom_point(size=1) +
theme_few() +
labs(x = "Proteomics FC",
y = "Transcriptomics FC",
title = "Compare proteomics-transcriptomics expression") +
scale_colour_tableau(
# scale_colour_manual(values = cbPalette,
name = "Differential expressed\nbetween proteins and transcripts",
labels=c("Not regulated",
"Up only in proteomics",
"Down only in proteomics",
"Up only in transcriptomics",
"Down only in transcriptomics",
"Up in proteomics and transcriptomicss",
"Down in proteomics and transcriptomics",
"Up in proteomics and down in transcriptomics",
"Down in proteomics and up in transcriptomics"))The follow table contains all the transcripts and all the identified proteins.
# Table
datatable(yf,
rownames = TRUE,
filter = "top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = c('csv', 'copy'))
)sessionInfo()## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggthemes_3.4.0 dplyr_0.7.4 bindrcpp_0.2
## [4] plotly_4.7.1 ggplot2_2.2.1 limma_3.32.10
## [7] psych_1.7.8 pRoloc_1.16.1 MLInterfaces_1.56.0
## [10] cluster_2.0.6 annotate_1.54.0 XML_3.98-1.9
## [13] AnnotationDbi_1.38.2 IRanges_2.10.5 S4Vectors_0.14.7
## [16] DT_0.2 MSnbase_2.2.0 ProtGenerics_1.8.0
## [19] BiocParallel_1.10.1 mzR_2.10.0 Rcpp_0.12.13
## [22] Biobase_2.36.2 BiocGenerics_0.22.1 rmarkdown_1.6
## [25] nvimcom_0.9-34
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.1 plyr_1.8.4 igraph_1.1.2
## [4] lazyeval_0.2.0 splines_3.4.2 ggvis_0.4.3
## [7] crosstalk_1.0.0 digest_0.6.12 foreach_1.4.3
## [10] BiocInstaller_1.26.1 htmltools_0.3.6 viridis_0.4.0
## [13] gdata_2.18.0 magrittr_1.5 memoise_1.1.0
## [16] doParallel_1.0.11 sfsmisc_1.1-1 recipes_0.1.0
## [19] gower_0.1.2 rda_1.0.2-2 dimRed_0.1.0
## [22] lpSolve_5.6.13 colorspace_1.3-2 blob_1.1.0
## [25] jsonlite_1.5 RCurl_1.95-4.8 hexbin_1.27.1
## [28] lme4_1.1-14 genefilter_1.58.1 bindr_0.1
## [31] impute_1.50.1 survival_2.41-3 iterators_1.0.8
## [34] glue_1.1.1 DRR_0.0.2 gtable_0.2.0
## [37] ipred_0.9-6 zlibbioc_1.22.0 MatrixModels_0.4-1
## [40] kernlab_0.9-25 ddalpha_1.3.1 prabclus_2.2-6
## [43] DEoptimR_1.0-8 scales_0.5.0 vsn_3.44.0
## [46] mvtnorm_1.0-6 DBI_0.7 viridisLite_0.2.0
## [49] xtable_1.8-2 foreign_0.8-69 bit_1.1-12
## [52] proxy_0.4-17 mclust_5.3 preprocessCore_1.38.1
## [55] lava_1.5.1 prodlim_1.6.1 httr_1.3.1
## [58] htmlwidgets_0.9 sampling_2.8 threejs_0.3.1
## [61] FNN_1.1 RColorBrewer_1.1-2 fpc_2.1-10
## [64] modeltools_0.2-21 pkgconfig_2.0.1 flexmix_2.3-14
## [67] nnet_7.3-12 caret_6.0-77 labeling_0.3
## [70] rlang_0.1.2 reshape2_1.4.2 munsell_0.4.3
## [73] mlbench_2.1-1 tools_3.4.2 RSQLite_2.0
## [76] pls_2.6-0 evaluate_0.10.1 stringr_1.2.0
## [79] mzID_1.14.0 yaml_2.1.14 ModelMetrics_1.1.0
## [82] knitr_1.17 bit64_0.9-7 robustbase_0.92-7
## [85] randomForest_4.6-12 purrr_0.2.3 dendextend_1.5.2
## [88] nlme_3.1-131 whisker_0.3-2 mime_0.5
## [91] RcppRoll_0.2.2 biomaRt_2.32.1 compiler_3.4.2
## [94] e1071_1.6-8 affyio_1.46.0 tibble_1.3.4
## [97] stringi_1.1.5 lattice_0.20-35 trimcluster_0.1-2
## [100] Matrix_1.2-11 nloptr_1.0.4 gbm_2.1.3
## [103] MALDIquant_1.16.4 data.table_1.10.4-2 bitops_1.0-6
## [106] httpuv_1.3.5 R6_2.2.2 pcaMethods_1.68.0
## [109] affy_1.54.0 hwriter_1.3.2 gridExtra_2.3
## [112] codetools_0.2-15 MASS_7.3-47 gtools_3.5.0
## [115] assertthat_0.2.0 CVST_0.2-1 rprojroot_1.2
## [118] withr_2.0.0 mnormt_1.5-5 diptest_0.75-7
## [121] grid_3.4.2 rpart_4.1-11 timeDate_3012.100
## [124] tidyr_0.7.2 minqa_1.2.4 class_7.3-14
## [127] shiny_1.0.5 lubridate_1.6.0 base64enc_0.1-3
1. Nogueira, F. C. S. et al. Performance of Isobaric and Isotopic Labeling in Quantitative Plant Proteomics. Journal of Proteome Research 11, 3046–3052 (2012).
2. Ritchie ME, W. D., Phipson B & GK, S. Limma powers differential expression analyses for rna-sequencing and microarray studies. Nucleic Acids Research 43, e47 (2015).